#include "def_arm64.S"
#if defined(__arm64__)

//*************************************************************************************************
//void tx_dct2_pb4_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift);
//
//4-point transform of the src lines, four lines per loop pass; results are stored transposed.
//x0: src, consecutive lines of 4 x s16 (8 bytes each); advanced by 32 bytes per pass
//x1: dst, output k of src line i is stored at dst[k * line + i]
//w2: line, dst row pitch in s16 elements
//w3: limit_line, number of src lines to process; the loop body runs before the count is
//    tested, so at least 4 lines are always processed
//w4: shift, 0 -> saturating narrow only, 7 -> rounding shift right by 7,
//    any other value -> rounding shift right by 2
//clobbers: x0, x2, x3, x5-x10, v0-v7, v16-v19, flags
//*************************************************************************************************
function tx_dct2_pb4_arm64
    // line, limit_line and shift are 32-bit ints: only w2/w3/w4 are defined on entry,
    // so widen before using x2 as an address increment.
    sxtw x2, w2
    lsl x2, x2, #1              // dst row pitch in bytes

    // transform coefficients: v4..v7 hold the weights applied to src samples 0..3
    mov w5, #32
    neg w6, w5
    mov w7, #42
    neg w8, w7
    mov w9, #17
    neg w10, w9
    mov v4.h[0], w5
    mov v4.h[1], w7
    mov v4.h[2], w5
    mov v4.h[3], w9         //32 42 32 17
    mov v5.h[0], w5
    mov v5.h[1], w9
    mov v5.h[2], w6
    mov v5.h[3], w8         //32 17 -32 -42
    mov v6.h[0], w5
    mov v6.h[1], w10
    mov v6.h[2], w6
    mov v6.h[3], w7         //32 -17 -32 42
    mov v7.h[0], w5
    mov v7.h[1], w8
    mov v7.h[2], w5
    mov v7.h[3], w10        //32 -42 32 -17

    mov x6, #0              // byte offset inside a dst row of the current 4-line group

tx_dct2_pb4_loop:
    // load 4 src lines, one per register
    ld1 {v0.4h - v3.4h}, [x0], #32

    // v16..v19 = 32-bit outputs 0..3 of src lines 0..3
    smull v16.4s, v4.4h, v0.h[0]
    smlal v16.4s, v5.4h, v0.h[1]
    smlal v16.4s, v6.4h, v0.h[2]
    smlal v16.4s, v7.4h, v0.h[3]
    smull v17.4s, v4.4h, v1.h[0]
    smlal v17.4s, v5.4h, v1.h[1]
    smlal v17.4s, v6.4h, v1.h[2]
    smlal v17.4s, v7.4h, v1.h[3]
    smull v18.4s, v4.4h, v2.h[0]
    smlal v18.4s, v5.4h, v2.h[1]
    smlal v18.4s, v6.4h, v2.h[2]
    smlal v18.4s, v7.4h, v2.h[3]
    smull v19.4s, v4.4h, v3.h[0]
    smlal v19.4s, v5.4h, v3.h[1]
    smlal v19.4s, v6.4h, v3.h[2]
    smlal v19.4s, v7.4h, v3.h[3]

    // narrow to s16 with saturation; the shift amount must be an immediate, hence the dispatch
    cbz w4, tx_dct2_pb4_shift0
    cmp w4, #7
    beq tx_dct2_pb4_shift7
    sqrshrn v16.4h, v16.4s, #2
    sqrshrn v17.4h, v17.4s, #2
    sqrshrn v18.4h, v18.4s, #2
    sqrshrn v19.4h, v19.4s, #2
    b tx_dct2_pb4_end

tx_dct2_pb4_shift0:
    sqxtn v16.4h, v16.4s
    sqxtn v17.4h, v17.4s
    sqxtn v18.4h, v18.4s
    sqxtn v19.4h, v19.4s
    b tx_dct2_pb4_end

tx_dct2_pb4_shift7:
    sqrshrn v16.4h, v16.4s, #7
    sqrshrn v17.4h, v17.4s, #7
    sqrshrn v18.4h, v18.4s, #7
    sqrshrn v19.4h, v19.4s, #7

tx_dct2_pb4_end:
    // lane k of v16..v19 is output k of the four lines: store it as 4 x s16 in dst row k
    add x5, x1, x6
    st4 {v16.h - v19.h}[0], [x5], x2
    st4 {v16.h - v19.h}[1], [x5], x2
    st4 {v16.h - v19.h}[2], [x5], x2
    st4 {v16.h - v19.h}[3], [x5], x2

    add x6, x6, #8          // next 4 dst columns

    subs w3, w3, #4
    bgt tx_dct2_pb4_loop

    ret


// 8 x 8 table of s16 transform weights, 8 values (16 bytes) per row.
// tx_dct2_pb8_arm64 loads row k into v16+k and multiplies it by source sample k,
// so entry [k][j] is the weight of input sample k in output j.
tx_dct2_pb8_coef:
.hword 32, 44, 42, 38, 32, 25, 17, 9, \
       32, 38, 17, -9, -32, -44, -42, -25, \
       32, 25, -17, -44, -32, 9, 42, 38, \
       32, 9, -42, -25, 32, 38, -17, -44, \
       32, -9, -42, 25, 32, -38, -17, 44, \
       32, -25, -17, 44, -32, -9, 42, -38, \
       32, -38, 17, 9, -32, 44, -42, 25, \
       32, -44, 42, -38, 32, -25, 17, -9

//*************************************************************************************************
//void tx_dct2_pb8_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift);
//
//8-point transform of the src lines, four lines per loop pass; results are stored transposed.
//x0: src, consecutive lines of 8 x s16 (16 bytes each); advanced by 64 bytes per pass
//x1: dst, output k of src line i is stored at dst[k * line + i]
//w2: line, dst row pitch in s16 elements
//w3: limit_line, number of src lines to process; the loop body runs before the count is
//    tested, so at least 4 lines are always processed
//w4: shift, 8 -> rounding shift right by 8, 1 -> rounding shift right by 1,
//    any other value -> rounding shift right by 3
//clobbers: x0, x2, x3, x5-x7, v0-v7, v16-v31, flags
//*************************************************************************************************

// Transform one src line held in \row against the weights in v16-v23 (table row k in v16+k):
// \acc_lo = outputs 0-3, \acc_hi = outputs 4-7, both as 32-bit sums.
.macro tx_dct2_pb8_line acc_lo, acc_hi, row
    smull   \acc_lo\().4s, v16.4h, \row\().h[0]
    smull2  \acc_hi\().4s, v16.8h, \row\().h[0]
    smlal   \acc_lo\().4s, v17.4h, \row\().h[1]
    smlal2  \acc_hi\().4s, v17.8h, \row\().h[1]
    smlal   \acc_lo\().4s, v18.4h, \row\().h[2]
    smlal2  \acc_hi\().4s, v18.8h, \row\().h[2]
    smlal   \acc_lo\().4s, v19.4h, \row\().h[3]
    smlal2  \acc_hi\().4s, v19.8h, \row\().h[3]
    smlal   \acc_lo\().4s, v20.4h, \row\().h[4]
    smlal2  \acc_hi\().4s, v20.8h, \row\().h[4]
    smlal   \acc_lo\().4s, v21.4h, \row\().h[5]
    smlal2  \acc_hi\().4s, v21.8h, \row\().h[5]
    smlal   \acc_lo\().4s, v22.4h, \row\().h[6]
    smlal2  \acc_hi\().4s, v22.8h, \row\().h[6]
    smlal   \acc_lo\().4s, v23.4h, \row\().h[7]
    smlal2  \acc_hi\().4s, v23.8h, \row\().h[7]
.endm

// Round, shift right by \bits and saturate the eight accumulators v24-v31 to s16:
// v0-v3 = outputs 0-3 of lines 0-3, v4-v7 = outputs 4-7 of lines 0-3.
.macro tx_dct2_pb8_narrow bits
    sqrshrn v0.4h, v24.4s, #\bits
    sqrshrn v1.4h, v26.4s, #\bits
    sqrshrn v2.4h, v28.4s, #\bits
    sqrshrn v3.4h, v30.4s, #\bits
    sqrshrn v4.4h, v25.4s, #\bits
    sqrshrn v5.4h, v27.4s, #\bits
    sqrshrn v6.4h, v29.4s, #\bits
    sqrshrn v7.4h, v31.4s, #\bits
.endm

function tx_dct2_pb8_arm64
    // line, limit_line and shift are 32-bit ints: only w2/w3/w4 are defined on entry,
    // so widen before using x2 as an address increment.
    sxtw x2, w2
    lsl x2, x2, #1              // dst row pitch in bytes
    mov x6, #0                  // byte offset inside a dst row of the current 4-line group

    // keep the whole 8 x 8 weight table resident in v16-v23
    adr x7, tx_dct2_pb8_coef
    ld1 {v16.8h - v19.8h}, [x7], #64
    ld1 {v20.8h - v23.8h}, [x7]

tx_dct2_pb8_loop:
    // load 4 src lines, one per register
    ld1 {v0.8h - v3.8h}, [x0], #64

    tx_dct2_pb8_line v24, v25, v0
    tx_dct2_pb8_line v26, v27, v1
    tx_dct2_pb8_line v28, v29, v2
    tx_dct2_pb8_line v30, v31, v3

    // the shift amount must be an immediate, hence the dispatch
    cmp w4, #8
    beq tx_dct2_pb8_shift8
    cmp w4, #1
    beq tx_dct2_pb8_shift1

    tx_dct2_pb8_narrow 3
    b tx_dct2_pb8_end

tx_dct2_pb8_shift1:
    tx_dct2_pb8_narrow 1
    b tx_dct2_pb8_end

tx_dct2_pb8_shift8:
    tx_dct2_pb8_narrow 8

tx_dct2_pb8_end:
    // each st4 writes one output index of the four lines as 4 x s16 into its own dst row
    add x5, x1, x6
    st4 {v0.h - v3.h}[0], [x5], x2
    st4 {v0.h - v3.h}[1], [x5], x2
    st4 {v0.h - v3.h}[2], [x5], x2
    st4 {v0.h - v3.h}[3], [x5], x2
    st4 {v4.h - v7.h}[0], [x5], x2
    st4 {v4.h - v7.h}[1], [x5], x2
    st4 {v4.h - v7.h}[2], [x5], x2
    st4 {v4.h - v7.h}[3], [x5], x2
    add x6, x6, #8              // next 4 dst columns

    subs w3, w3, #4
    bgt tx_dct2_pb8_loop

    ret


// 16 x 16 table of s16 transform weights, 16 values (32 bytes) per row.
// tx_dct2_pb16_arm64 reads it in tiles of 8 rows x 8 columns and multiplies row k by
// source sample k, so entry [k][j] is the weight of input sample k in output j.
tx_dct2_pb16_coef:
.hword 32, 45, 44, 43, 42, 40, 38, 35, 32, 29, 25, 21, 17, 13, 9, 4, \
       32, 43, 38, 29, 17, 4, -9, -21, -32, -40, -44, -45, -42, -35, -25, -13, \
       32, 40, 25, 4, -17, -35, -44, -43, -32, -13, 9, 29, 42, 45, 38, 21, \
       32, 35, 9, -21, -42, -43, -25, 4, 32, 45, 38, 13, -17, -40, -44, -29, \
       32, 29, -9, -40, -42, -13, 25, 45, 32, -4, -38, -43, -17, 21, 44, 35, \
       32, 21, -25, -45, -17, 29, 44, 13, -32, -43, -9, 35, 42, 4, -38, -40, \
       32, 13, -38, -35, 17, 45, 9, -40, -32, 21, 44, 4, -42, -29, 25, 43, \
       32, 4, -44, -13, 42, 21, -38, -29, 32, 35, -25, -40, 17, 43, -9, -45, \
       32, -4, -44, 13, 42, -21, -38, 29, 32, -35, -25, 40, 17, -43, -9, 45, \
       32, -13, -38, 35, 17, -45, 9, 40, -32, -21, 44, -4, -42, 29, 25, -43, \
       32, -21, -25, 45, -17, -29, 44, -13, -32, 43, -9, -35, 42, -4, -38, 40, \
       32, -29, -9, 40, -42, 13, 25, -45, 32, 4, -38, 43, -17, -21, 44, -35, \
       32, -35, 9, 21, -42, 43, -25, -4, 32, -45, 38, -13, -17, 40, -44, 29, \
       32, -40, 25, -4, -17, 35, -44, 43, -32, 13, 9, -29, 42, -45, 38, -21, \
       32, -43, 38, -29, 17, -4, -9, 21, -32, 40, -44, 45, -42, 35, -25, 13, \
       32, -45, 44, -43, 42, -40, 38, -35, 32, -29, 25, -21, 17, -13, 9, -4

//*************************************************************************************************
//void tx_dct2_pb16_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift);
//
//16-point transform of the src lines, four lines per outer pass; results are stored transposed.
//Each outer pass produces the 16 outputs in two groups of 8, and each group accumulates over
//the 16 input samples in two steps of 8.
//x0: src, consecutive lines of 16 x s16 (32 bytes each); advanced by 128 bytes per outer pass
//x1: dst, output k of src line i is stored at dst[k * line + i]; advanced by 8 bytes per pass
//w2: line, dst row pitch in s16 elements
//w3: limit_line, number of src lines to process; the loop body runs before the count is
//    tested, so at least 4 lines are always processed
//w4: shift, 9 -> rounding shift right by 9, 2 -> rounding shift right by 2,
//    any other value -> rounding shift right by 4
//clobbers: x0-x3, x5-x9, x11, x12, v0-v7, v16-v31, flags
//*************************************************************************************************

// Accumulate 8 input samples of one src line (\row) against the 8 x 8 weight tile in v16-v23:
// \acc_lo += 4 lower outputs of the tile, \acc_hi += 4 upper outputs of the tile.
.macro tx_dct2_pb16_mac acc_lo, acc_hi, row
    smlal   \acc_lo\().4s, v16.4h, \row\().h[0]
    smlal2  \acc_hi\().4s, v16.8h, \row\().h[0]
    smlal   \acc_lo\().4s, v17.4h, \row\().h[1]
    smlal2  \acc_hi\().4s, v17.8h, \row\().h[1]
    smlal   \acc_lo\().4s, v18.4h, \row\().h[2]
    smlal2  \acc_hi\().4s, v18.8h, \row\().h[2]
    smlal   \acc_lo\().4s, v19.4h, \row\().h[3]
    smlal2  \acc_hi\().4s, v19.8h, \row\().h[3]
    smlal   \acc_lo\().4s, v20.4h, \row\().h[4]
    smlal2  \acc_hi\().4s, v20.8h, \row\().h[4]
    smlal   \acc_lo\().4s, v21.4h, \row\().h[5]
    smlal2  \acc_hi\().4s, v21.8h, \row\().h[5]
    smlal   \acc_lo\().4s, v22.4h, \row\().h[6]
    smlal2  \acc_hi\().4s, v22.8h, \row\().h[6]
    smlal   \acc_lo\().4s, v23.4h, \row\().h[7]
    smlal2  \acc_hi\().4s, v23.8h, \row\().h[7]
.endm

// Round, shift right by \bits and saturate the eight accumulators v24-v31 to s16:
// v0-v3 = lower 4 outputs of lines 0-3, v4-v7 = upper 4 outputs of lines 0-3.
.macro tx_dct2_pb16_narrow bits
    sqrshrn v0.4h, v24.4s, #\bits
    sqrshrn v1.4h, v26.4s, #\bits
    sqrshrn v2.4h, v28.4s, #\bits
    sqrshrn v3.4h, v30.4s, #\bits
    sqrshrn v4.4h, v25.4s, #\bits
    sqrshrn v5.4h, v27.4s, #\bits
    sqrshrn v6.4h, v29.4s, #\bits
    sqrshrn v7.4h, v31.4s, #\bits
.endm

function tx_dct2_pb16_arm64
    // line, limit_line and shift are 32-bit ints: only w2/w3/w4 are defined on entry,
    // so widen before using x2 as an address increment.
    sxtw x2, w2
    lsl x2, x2, #1              // dst row pitch in bytes
    adr x7, tx_dct2_pb16_coef
    mov x12, #32                // byte pitch of one src line and of one weight-table row

tx_dct2_pb16_loopk:             // one pass per group of 4 src lines
    mov x8, #0                  // weight-table byte offset: 32 * input sample + 2 * output
    mov x11, x1                 // dst write pointer for this line group
tx_dct2_pb16_loopj:             // one pass per group of 8 outputs
    mov x5, #0                  // byte offset of the current 8 input samples in a src line
    movi v24.16b, #0
    movi v25.16b, #0
    movi v26.16b, #0
    movi v27.16b, #0
    movi v28.16b, #0
    movi v29.16b, #0
    movi v30.16b, #0
    movi v31.16b, #0
tx_dct2_pb16_loopi:             // one pass per group of 8 input samples
    // 8 samples from each of the 4 src lines
    add x6, x0, x5
    ld1 {v0.8h}, [x6], x12
    ld1 {v1.8h}, [x6], x12
    ld1 {v2.8h}, [x6], x12
    ld1 {v3.8h}, [x6], x12

    // matching 8 x 8 tile of the weight table
    add x9, x7, x8
    ld1 {v16.8h}, [x9], x12
    ld1 {v17.8h}, [x9], x12
    ld1 {v18.8h}, [x9], x12
    ld1 {v19.8h}, [x9], x12
    ld1 {v20.8h}, [x9], x12
    ld1 {v21.8h}, [x9], x12
    ld1 {v22.8h}, [x9], x12
    ld1 {v23.8h}, [x9], x12

    tx_dct2_pb16_mac v24, v25, v0
    tx_dct2_pb16_mac v26, v27, v1
    tx_dct2_pb16_mac v28, v29, v2
    tx_dct2_pb16_mac v30, v31, v3

    add x5, x5, #16             // next 8 input samples
    add x8, x8, #256            // 8 weight-table rows further down
    cmp x5, #32                 // a src line is 32 bytes
    bne tx_dct2_pb16_loopi

    // the shift amount must be an immediate, hence the dispatch
    cmp w4, #9
    beq tx_dct2_pb16_shift9
    cmp w4, #2
    beq tx_dct2_pb16_shift2
    tx_dct2_pb16_narrow 4
    b tx_dct2_pb16_end

tx_dct2_pb16_shift2:
    tx_dct2_pb16_narrow 2
    b tx_dct2_pb16_end

tx_dct2_pb16_shift9:
    tx_dct2_pb16_narrow 9

tx_dct2_pb16_end:
    // each st4 writes one output index of the four lines as 4 x s16 into its own dst row;
    // x11 keeps walking down dst rows across the output groups
    st4 {v0.h - v3.h}[0], [x11], x2
    st4 {v0.h - v3.h}[1], [x11], x2
    st4 {v0.h - v3.h}[2], [x11], x2
    st4 {v0.h - v3.h}[3], [x11], x2
    st4 {v4.h - v7.h}[0], [x11], x2
    st4 {v4.h - v7.h}[1], [x11], x2
    st4 {v4.h - v7.h}[2], [x11], x2
    st4 {v4.h - v7.h}[3], [x11], x2

    sub x8, x8, #496            // back to table row 0 (-512), on to the next 8 outputs (+16)
    cmp x8, #32                 // a weight-table row is 32 bytes
    bne tx_dct2_pb16_loopj

    add x0, x0, #128            // next 4 src lines
    add x1, x1, #8              // next 4 dst columns
    subs w3, w3, #4
    bgt tx_dct2_pb16_loopk

    ret

// 32 x 32 table of s16 transform weights, 32 values (64 bytes) per row.
// tx_dct2_pb32_arm64 reads it in tiles of 8 rows x 8 columns and multiplies row k by
// source sample k, so entry [k][j] is the weight of input sample k in output j.
tx_dct2_pb32_coef:
.hword 32, 45, 45, 45, 44, 44, 43, 43, 42, 41, 40, 39, 38, 36, 35, 34, 32, 30, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 4, 2, \
       32, 45, 43, 41, 38, 34, 29, 23, 17, 11, 4, -2, -9, -15, -21, -27, -32, -36, -40, -43, -44, -45, -45, -44, -42, -39, -35, -30, -25, -19, -13, -7, \
       32, 44, 40, 34, 25, 15, 4, -7, -17, -27, -35, -41, -44, -45, -43, -39, -32, -23, -13, -2, 9, 19, 29, 36, 42, 45, 45, 43, 38, 30, 21, 11, \
       32, 43, 35, 23, 9, -7, -21, -34, -42, -45, -43, -36, -25, -11, 4, 19, 32, 41, 45, 44, 38, 27, 13, -2, -17, -30, -40, -45, -44, -39, -29, -15, \
       32, 41, 29, 11, -9, -27, -40, -45, -42, -30, -13, 7, 25, 39, 45, 43, 32, 15, -4, -23, -38, -45, -43, -34, -17, 2, 21, 36, 44, 44, 35, 19, \
       32, 39, 21, -2, -25, -41, -45, -36, -17, 7, 29, 43, 44, 34, 13, -11, -32, -44, -43, -30, -9, 15, 35, 45, 42, 27, 4, -19, -38, -45, -40, -23, \
       32, 36, 13, -15, -38, -45, -35, -11, 17, 39, 45, 34, 9, -19, -40, -45, -32, -7, 21, 41, 44, 30, 4, -23, -42, -44, -29, -2, 25, 43, 43, 27, \
       32, 34, 4, -27, -44, -39, -13, 19, 42, 43, 21, -11, -38, -45, -29, 2, 32, 45, 35, 7, -25, -44, -40, -15, 17, 41, 43, 23, -9, -36, -45, -30, \
       32, 30, -4, -36, -44, -23, 13, 41, 42, 15, -21, -44, -38, -7, 29, 45, 32, -2, -35, -45, -25, 11, 40, 43, 17, -19, -43, -39, -9, 27, 45, 34, \
       32, 27, -13, -43, -38, -2, 35, 44, 17, -23, -45, -30, 9, 41, 40, 7, -32, -45, -21, 19, 44, 34, -4, -39, -42, -11, 29, 45, 25, -15, -43, -36, \
       32, 23, -21, -45, -25, 19, 45, 27, -17, -45, -29, 15, 44, 30, -13, -44, -32, 11, 43, 34, -9, -43, -35, 7, 42, 36, -4, -41, -38, 2, 40, 39, \
       32, 19, -29, -44, -9, 36, 40, -2, -42, -34, 13, 45, 25, -23, -45, -15, 32, 43, 4, -39, -38, 7, 43, 30, -17, -45, -21, 27, 44, 11, -35, -41, \
       32, 15, -35, -39, 9, 45, 21, -30, -42, 2, 43, 27, -25, -44, -4, 41, 32, -19, -45, -11, 38, 36, -13, -45, -17, 34, 40, -7, -44, -23, 29, 43, \
       32, 11, -40, -30, 25, 43, -4, -45, -17, 36, 35, -19, -44, -2, 43, 23, -32, -39, 13, 45, 9, -41, -29, 27, 42, -7, -45, -15, 38, 34, -21, -44, \
       32, 7, -43, -19, 38, 30, -29, -39, 17, 44, -4, -45, -9, 43, 21, -36, -32, 27, 40, -15, -44, 2, 45, 11, -42, -23, 35, 34, -25, -41, 13, 45, \
       32, 2, -45, -7, 44, 11, -43, -15, 42, 19, -40, -23, 38, 27, -35, -30, 32, 34, -29, -36, 25, 39, -21, -41, 17, 43, -13, -44, 9, 45, -4, -45, \
       32, -2, -45, 7, 44, -11, -43, 15, 42, -19, -40, 23, 38, -27, -35, 30, 32, -34, -29, 36, 25, -39, -21, 41, 17, -43, -13, 44, 9, -45, -4, 45, \
       32, -7, -43, 19, 38, -30, -29, 39, 17, -44, -4, 45, -9, -43, 21, 36, -32, -27, 40, 15, -44, -2, 45, -11, -42, 23, 35, -34, -25, 41, 13, -45, \
       32, -11, -40, 30, 25, -43, -4, 45, -17, -36, 35, 19, -44, 2, 43, -23, -32, 39, 13, -45, 9, 41, -29, -27, 42, 7, -45, 15, 38, -34, -21, 44, \
       32, -15, -35, 39, 9, -45, 21, 30, -42, -2, 43, -27, -25, 44, -4, -41, 32, 19, -45, 11, 38, -36, -13, 45, -17, -34, 40, 7, -44, 23, 29, -43, \
       32, -19, -29, 44, -9, -36, 40, 2, -42, 34, 13, -45, 25, 23, -45, 15, 32, -43, 4, 39, -38, -7, 43, -30, -17, 45, -21, -27, 44, -11, -35, 41, \
       32, -23, -21, 45, -25, -19, 45, -27, -17, 45, -29, -15, 44, -30, -13, 44, -32, -11, 43, -34, -9, 43, -35, -7, 42, -36, -4, 41, -38, -2, 40, -39, \
       32, -27, -13, 43, -38, 2, 35, -44, 17, 23, -45, 30, 9, -41, 40, -7, -32, 45, -21, -19, 44, -34, -4, 39, -42, 11, 29, -45, 25, 15, -43, 36, \
       32, -30, -4, 36, -44, 23, 13, -41, 42, -15, -21, 44, -38, 7, 29, -45, 32, 2, -35, 45, -25, -11, 40, -43, 17, 19, -43, 39, -9, -27, 45, -34, \
       32, -34, 4, 27, -44, 39, -13, -19, 42, -43, 21, 11, -38, 45, -29, -2, 32, -45, 35, -7, -25, 44, -40, 15, 17, -41, 43, -23, -9, 36, -45, 30, \
       32, -36, 13, 15, -38, 45, -35, 11, 17, -39, 45, -34, 9, 19, -40, 45, -32, 7, 21, -41, 44, -30, 4, 23, -42, 44, -29, 2, 25, -43, 43, -27, \
       32, -39, 21, 2, -25, 41, -45, 36, -17, -7, 29, -43, 44, -34, 13, 11, -32, 44, -43, 30, -9, -15, 35, -45, 42, -27, 4, 19, -38, 45, -40, 23, \
       32, -41, 29, -11, -9, 27, -40, 45, -42, 30, -13, -7, 25, -39, 45, -43, 32, -15, -4, 23, -38, 45, -43, 34, -17, -2, 21, -36, 44, -44, 35, -19, \
       32, -43, 35, -23, 9, 7, -21, 34, -42, 45, -43, 36, -25, 11, 4, -19, 32, -41, 45, -44, 38, -27, 13, 2, -17, 30, -40, 45, -44, 39, -29, 15, \
       32, -44, 40, -34, 25, -15, 4, 7, -17, 27, -35, 41, -44, 45, -43, 39, -32, 23, -13, 2, 9, -19, 29, -36, 42, -45, 45, -43, 38, -30, 21, -11, \
       32, -45, 43, -41, 38, -34, 29, -23, 17, -11, 4, 2, -9, 15, -21, 27, -32, 36, -40, 43, -44, 45, -45, 44, -42, 39, -35, 30, -25, 19, -13, 7, \
       32, -45, 45, -45, 44, -44, 43, -43, 42, -41, 40, -39, 38, -36, 35, -34, 32, -30, 29, -27, 25, -23, 21, -19, 17, -15, 13, -11, 9, -7, 4, -2
//*************************************************************************************************
//void tx_dct2_pb32_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift);
//
//32-point transform of the src lines, four lines per outer pass; results are stored transposed.
//Each outer pass produces the 32 outputs in four groups of 8, and each group accumulates over
//the 32 input samples in four steps of 8.
//x0: src, consecutive lines of 32 x s16 (64 bytes each); advanced by 256 bytes per outer pass
//x1: dst, output k of src line i is stored at dst[k * line + i]; advanced by 8 bytes per pass
//w2: line, dst row pitch in s16 elements
//w3: limit_line, number of src lines to process; the loop body runs before the count is
//    tested, so at least 4 lines are always processed
//w4: shift, 10 -> rounding shift right by 10, 3 -> rounding shift right by 3,
//    any other value -> rounding shift right by 5
//clobbers: x0-x3, x5-x9, x11, x12, v0-v7, v16-v31, flags
//*************************************************************************************************

// Accumulate 8 input samples of one src line (\row) against the 8 x 8 weight tile in v16-v23:
// \acc_lo += 4 lower outputs of the tile, \acc_hi += 4 upper outputs of the tile.
.macro tx_dct2_pb32_mac acc_lo, acc_hi, row
    smlal   \acc_lo\().4s, v16.4h, \row\().h[0]
    smlal2  \acc_hi\().4s, v16.8h, \row\().h[0]
    smlal   \acc_lo\().4s, v17.4h, \row\().h[1]
    smlal2  \acc_hi\().4s, v17.8h, \row\().h[1]
    smlal   \acc_lo\().4s, v18.4h, \row\().h[2]
    smlal2  \acc_hi\().4s, v18.8h, \row\().h[2]
    smlal   \acc_lo\().4s, v19.4h, \row\().h[3]
    smlal2  \acc_hi\().4s, v19.8h, \row\().h[3]
    smlal   \acc_lo\().4s, v20.4h, \row\().h[4]
    smlal2  \acc_hi\().4s, v20.8h, \row\().h[4]
    smlal   \acc_lo\().4s, v21.4h, \row\().h[5]
    smlal2  \acc_hi\().4s, v21.8h, \row\().h[5]
    smlal   \acc_lo\().4s, v22.4h, \row\().h[6]
    smlal2  \acc_hi\().4s, v22.8h, \row\().h[6]
    smlal   \acc_lo\().4s, v23.4h, \row\().h[7]
    smlal2  \acc_hi\().4s, v23.8h, \row\().h[7]
.endm

// Round, shift right by \bits and saturate the eight accumulators v24-v31 to s16:
// v0-v3 = lower 4 outputs of lines 0-3, v4-v7 = upper 4 outputs of lines 0-3.
.macro tx_dct2_pb32_narrow bits
    sqrshrn v0.4h, v24.4s, #\bits
    sqrshrn v1.4h, v26.4s, #\bits
    sqrshrn v2.4h, v28.4s, #\bits
    sqrshrn v3.4h, v30.4s, #\bits
    sqrshrn v4.4h, v25.4s, #\bits
    sqrshrn v5.4h, v27.4s, #\bits
    sqrshrn v6.4h, v29.4s, #\bits
    sqrshrn v7.4h, v31.4s, #\bits
.endm

function tx_dct2_pb32_arm64
    // line, limit_line and shift are 32-bit ints: only w2/w3/w4 are defined on entry,
    // so widen before using x2 as an address increment.
    sxtw x2, w2
    lsl x2, x2, #1              // dst row pitch in bytes
    adr x7, tx_dct2_pb32_coef
    mov x12, #64                // byte pitch of one src line and of one weight-table row

tx_dct2_pb32_loopk:             // one pass per group of 4 src lines
    mov x8, #0                  // weight-table byte offset: 64 * input sample + 2 * output
    mov x11, x1                 // dst write pointer for this line group
tx_dct2_pb32_loopj:             // one pass per group of 8 outputs
    mov x5, #0                  // byte offset of the current 8 input samples in a src line
    movi v24.16b, #0
    movi v25.16b, #0
    movi v26.16b, #0
    movi v27.16b, #0
    movi v28.16b, #0
    movi v29.16b, #0
    movi v30.16b, #0
    movi v31.16b, #0
tx_dct2_pb32_loopi:             // one pass per group of 8 input samples
    // 8 samples from each of the 4 src lines
    add x6, x0, x5
    ld1 {v0.8h}, [x6], x12
    ld1 {v1.8h}, [x6], x12
    ld1 {v2.8h}, [x6], x12
    ld1 {v3.8h}, [x6], x12

    // matching 8 x 8 tile of the weight table
    add x9, x7, x8
    ld1 {v16.8h}, [x9], x12
    ld1 {v17.8h}, [x9], x12
    ld1 {v18.8h}, [x9], x12
    ld1 {v19.8h}, [x9], x12
    ld1 {v20.8h}, [x9], x12
    ld1 {v21.8h}, [x9], x12
    ld1 {v22.8h}, [x9], x12
    ld1 {v23.8h}, [x9], x12

    tx_dct2_pb32_mac v24, v25, v0
    tx_dct2_pb32_mac v26, v27, v1
    tx_dct2_pb32_mac v28, v29, v2
    tx_dct2_pb32_mac v30, v31, v3

    add x5, x5, #16             // next 8 input samples
    add x8, x8, #512            // 8 weight-table rows further down
    cmp x5, #64                 // a src line is 64 bytes
    bne tx_dct2_pb32_loopi

    // the shift amount must be an immediate, hence the dispatch
    cmp w4, #10
    beq tx_dct2_pb32_shift10
    cmp w4, #3
    beq tx_dct2_pb32_shift3
    tx_dct2_pb32_narrow 5
    b tx_dct2_pb32_end

tx_dct2_pb32_shift3:
    tx_dct2_pb32_narrow 3
    b tx_dct2_pb32_end

tx_dct2_pb32_shift10:
    tx_dct2_pb32_narrow 10

tx_dct2_pb32_end:
    // each st4 writes one output index of the four lines as 4 x s16 into its own dst row;
    // x11 keeps walking down dst rows across the output groups
    st4 {v0.h - v3.h}[0], [x11], x2
    st4 {v0.h - v3.h}[1], [x11], x2
    st4 {v0.h - v3.h}[2], [x11], x2
    st4 {v0.h - v3.h}[3], [x11], x2
    st4 {v4.h - v7.h}[0], [x11], x2
    st4 {v4.h - v7.h}[1], [x11], x2
    st4 {v4.h - v7.h}[2], [x11], x2
    st4 {v4.h - v7.h}[3], [x11], x2

    sub x8, x8, #2032           // back to table row 0 (-2048), on to the next 8 outputs (+16)
    cmp x8, #64                 // a weight-table row is 64 bytes
    bne tx_dct2_pb32_loopj

    add x0, x0, #256            // next 4 src lines
    add x1, x1, #8              // next 4 dst columns
    subs w3, w3, #4
    bgt tx_dct2_pb32_loopk

    ret

tx_dct2_pb64_coef:
.hword 32, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 42, 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33, \
       32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1, \
       32, 45, 45, 44, 43, 42, 41, 39, 38, 36, 34, 31, 29, 26, 23, 20, 17, 14, 11, 8, 4, 1, -2, -6, -9, -12, -15, -18, -21, -24, -27, -30, \
       -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43, -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10, -7, -3, \
       32, 45, 44, 42, 40, 37, 34, 30, 25, 20, 15, 10, 4, -1, -7, -12, -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36, \
       -32, -28, -23, -18, -13, -8, -2, 3, 9, 14, 19, 24, 29, 33, 36, 39, 42, 44, 45, 45, 45, 44, 43, 40, 38, 34, 30, 26, 21, 16, 11, 6, \
       32, 45, 43, 39, 35, 30, 23, 16, 9, 1, -7, -14, -21, -28, -34, -38, -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11, -3, 4, 12, 19, 26, \
       32, 37, 41, 44, 45, 45, 44, 41, 38, 33, 27, 20, 13, 6, -2, -10, -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15, -8, \
       32, 44, 41, 36, 29, 20, 11, 1, -9, -18, -27, -34, -40, -44, -45, -45, -42, -37, -30, -22, -13, -3, 7, 16, 25, 33, 39, 43, 45, 45, 43, 38, \
       32, 24, 15, 6, -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26, -17, -8, 2, 12, 21, 30, 36, 41, 44, 45, 44, 40, 35, 28, 19, 10, \
       32, 44, 39, 31, 21, 10, -2, -14, -25, -34, -41, -45, -45, -42, -36, -28, -17, -6, 7, 18, 29, 37, 43, 45, 44, 40, 34, 24, 13, 1, -11, -22, \
       -32, -39, -44, -45, -43, -38, -30, -20, -9, 3, 15, 26, 35, 41, 45, 45, 42, 36, 27, 16, 4, -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12, \
       32, 43, 36, 26, 13, -1, -15, -28, -38, -44, -45, -42, -35, -24, -11, 3, 17, 30, 39, 44, 45, 41, 34, 22, 9, -6, -19, -31, -40, -45, -45, -40, \
       -32, -20, -7, 8, 21, 33, 41, 45, 44, 39, 30, 18, 4, -10, -23, -34, -42, -45, -44, -38, -29, -16, -2, 12, 25, 36, 43, 45, 43, 37, 27, 14, \
       32, 42, 34, 20, 4, -12, -27, -38, -44, -45, -39, -28, -13, 3, 19, 33, 42, 45, 43, 34, 21, 6, -11, -26, -38, -44, -45, -39, -29, -14, 2, 18, \
       32, 41, 45, 43, 35, 22, 7, -10, -25, -37, -44, -45, -40, -30, -15, 1, 17, 31, 41, 45, 43, 36, 23, 8, -9, -24, -36, -44, -45, -40, -30, -16, \
       32, 41, 30, 14, -4, -22, -36, -44, -44, -37, -23, -6, 13, 30, 41, 45, 42, 31, 15, -3, -21, -36, -44, -45, -38, -24, -7, 12, 29, 40, 45, 42, \
       32, 16, -2, -20, -35, -44, -45, -38, -25, -8, 11, 28, 40, 45, 43, 33, 17, -1, -19, -34, -43, -45, -39, -26, -9, 10, 27, 39, 45, 43, 34, 18, \
       32, 40, 27, 8, -13, -31, -43, -45, -38, -22, -2, 18, 35, 44, 44, 34, 17, -3, -23, -38, -45, -42, -30, -12, 9, 28, 41, 45, 40, 26, 7, -14, \
       -32, -43, -45, -37, -21, -1, 19, 36, 44, 44, 34, 16, -4, -24, -39, -45, -42, -30, -11, 10, 29, 41, 45, 39, 25, 6, -15, -33, -43, -45, -36, -20, \
       32, 39, 23, 1, -21, -38, -45, -40, -25, -3, 19, 37, 45, 41, 27, 6, -17, -36, -45, -42, -29, -8, 15, 34, 44, 43, 30, 10, -13, -33, -44, -44, \
       -32, -12, 11, 31, 43, 44, 34, 14, -9, -30, -43, -45, -35, -16, 7, 28, 42, 45, 36, 18, -4, -26, -41, -45, -38, -20, 2, 24, 40, 45, 39, 22, \
       32, 38, 19, -6, -29, -43, -44, -31, -9, 16, 36, 45, 40, 22, -2, -26, -42, -45, -34, -12, 13, 34, 45, 41, 25, 1, -23, -40, -45, -36, -15, 10, \
       32, 44, 43, 28, 4, -20, -39, -45, -38, -18, 7, 30, 43, 44, 30, 8, -17, -37, -45, -39, -21, 3, 27, 42, 44, 33, 11, -14, -35, -45, -41, -24, \
       32, 37, 15, -12, -35, -45, -39, -18, 9, 33, 45, 40, 21, -6, -30, -44, -42, -24, 2, 28, 43, 43, 27, 1, -25, -42, -44, -30, -4, 22, 41, 45, \
       32, 8, -19, -39, -45, -34, -11, 16, 38, 45, 36, 14, -13, -36, -45, -38, -17, 10, 34, 45, 40, 20, -7, -31, -44, -41, -23, 3, 29, 44, 43, 26, \
       32, 36, 11, -18, -40, -45, -30, -3, 25, 43, 43, 24, -4, -31, -45, -39, -17, 12, 36, 45, 35, 10, -19, -40, -44, -30, -2, 26, 43, 42, 23, -6, \
       -32, -45, -39, -16, 13, 37, 45, 34, 9, -20, -41, -44, -29, -1, 27, 44, 42, 22, -7, -33, -45, -38, -15, 14, 38, 45, 34, 8, -21, -41, -44, -28, \
       32, 34, 7, -24, -43, -41, -19, 12, 38, 45, 30, 1, -29, -45, -39, -14, 17, 40, 44, 26, -4, -33, -45, -36, -9, 22, 43, 42, 21, -10, -36, -45, \
       -32, -3, 27, 44, 40, 16, -15, -39, -44, -28, 2, 31, 45, 37, 11, -20, -42, -43, -23, 8, 35, 45, 34, 6, -25, -44, -41, -18, 13, 38, 45, 30, \
       32, 33, 2, -30, -45, -36, -7, 26, 44, 38, 11, -22, -43, -40, -15, 18, 42, 42, 19, -14, -40, -44, -23, 10, 38, 45, 27, -6, -35, -45, -30, 1, \
       32, 45, 34, 3, -29, -45, -36, -8, 25, 44, 39, 12, -21, -43, -41, -16, 17, 41, 43, 20, -13, -39, -44, -24, 9, 37, 45, 28, -4, -34, -45, -31, \
       32, 31, -2, -34, -45, -28, 7, 37, 44, 24, -11, -39, -43, -20, 15, 41, 42, 16, -19, -43, -40, -12, 23, 44, 38, 8, -27, -45, -35, -3, 30, 45, \
       32, -1, -34, -45, -29, 6, 36, 45, 25, -10, -39, -44, -21, 14, 41, 42, 17, -18, -43, -40, -13, 22, 44, 38, 9, -26, -45, -36, -4, 30, 45, 33, \
       32, 30, -7, -38, -43, -18, 19, 44, 38, 6, -30, -45, -29, 8, 39, 43, 17, -20, -44, -37, -4, 31, 45, 28, -9, -39, -43, -16, 21, 44, 36, 3, \
       -32, -45, -27, 10, 40, 42, 15, -22, -44, -36, -2, 33, 45, 26, -11, -40, -42, -14, 23, 45, 35, 1, -34, -45, -25, 12, 41, 41, 13, -24, -45, -34, \
       32, 28, -11, -41, -40, -8, 30, 45, 25, -14, -43, -38, -4, 33, 45, 22, -17, -44, -36, -1, 35, 44, 19, -20, -44, -34, 2, 37, 43, 16, -23, -45, \
       -32, 6, 39, 42, 13, -26, -45, -30, 9, 40, 41, 10, -29, -45, -27, 12, 42, 39, 7, -31, -45, -24, 15, 43, 38, 3, -34, -45, -21, 18, 44, 36, \
       32, 26, -15, -44, -35, 3, 39, 41, 9, -31, -45, -20, 21, 45, 30, -10, -42, -38, -2, 36, 43, 14, -27, -45, -25, 16, 44, 34, -4, -39, -41, -8, \
       32, 45, 19, -22, -45, -30, 11, 42, 38, 1, -36, -43, -13, 28, 45, 24, -17, -44, -34, 6, 40, 40, 7, -33, -44, -18, 23, 45, 29, -12, -43, -37, \
       32, 24, -19, -45, -29, 14, 44, 33, -9, -42, -36, 3, 40, 39, 2, -37, -42, -8, 34, 44, 13, -30, -45, -18, 25, 45, 23, -20, -45, -28, 15, 44, \
       32, -10, -43, -36, 4, 40, 39, 1, -38, -41, -7, 34, 43, 12, -30, -45, -17, 26, 45, 22, -21, -45, -27, 16, 44, 31, -11, -43, -35, 6, 41, 38, \
       32, 22, -23, -45, -21, 24, 45, 20, -25, -45, -19, 26, 45, 18, -27, -45, -17, 28, 45, 16, -29, -45, -15, 30, 44, 14, -30, -44, -13, 31, 44, 12, \
       -32, -44, -11, 33, 43, 10, -34, -43, -9, 34, 43, 8, -35, -42, -7, 36, 42, 6, -36, -41, -4, 37, 41, 3, -38, -40, -2, 38, 40, 1, -39, -39, \
       32, 20, -27, -45, -13, 33, 43, 6, -38, -39, 2, 41, 35, -10, -44, -30, 17, 45, 23, -24, -45, -16, 30, 44, 9, -36, -41, -1, 40, 37, -7, -43, \
       -32, 14, 45, 26, -21, -45, -19, 28, 44, 12, -34, -42, -4, 38, 39, -3, -42, -34, 11, 44, 29, -18, -45, -22, 25, 45, 15, -31, -43, -8, 36, 40, \
       32, 18, -30, -43, -4, 39, 36, -10, -44, -26, 23, 45, 13, -34, -41, 1, 42, 33, -15, -45, -21, 28, 44, 8, -38, -38, 7, 44, 29, -20, -45, -16, \
       32, 42, 2, -40, -35, 12, 45, 24, -25, -45, -11, 36, 40, -3, -43, -31, 17, 45, 19, -30, -43, -6, 39, 37, -9, -44, -27, 22, 45, 14, -34, -41, \
       32, 16, -34, -40, 4, 44, 27, -24, -44, -8, 39, 36, -13, -45, -19, 31, 42, -1, -43, -30, 21, 45, 11, -37, -38, 10, 45, 22, -29, -43, -2, 41, \
       32, -18, -45, -14, 35, 39, -7, -44, -25, 26, 44, 6, -40, -34, 15, 45, 17, -33, -41, 3, 43, 28, -23, -45, -9, 38, 36, -12, -45, -20, 30, 42, \
       32, 14, -36, -37, 13, 45, 15, -36, -38, 12, 45, 16, -35, -38, 11, 45, 17, -34, -39, 10, 45, 18, -34, -39, 9, 45, 19, -33, -40, 8, 45, 20, \
       -32, -40, 7, 45, 21, -31, -41, 6, 44, 22, -30, -41, 4, 44, 23, -30, -42, 3, 44, 24, -29, -42, 2, 44, 25, -28, -43, 1, 43, 26, -27, -43, \
       32, 12, -39, -33, 21, 44, 2, -43, -25, 30, 41, -8, -45, -16, 36, 36, -17, -45, -7, 41, 29, -26, -43, 3, 44, 20, -34, -38, 13, 45, 11, -39, \
       -32, 22, 44, 1, -43, -24, 30, 40, -9, -45, -15, 37, 35, -18, -45, -6, 42, 28, -27, -42, 4, 45, 19, -34, -38, 14, 45, 10, -40, -31, 23, 44, \
       32, 10, -41, -28, 29, 40, -11, -45, -9, 41, 27, -30, -40, 12, 45, 8, -42, -26, 30, 39, -13, -45, -7, 42, 25, -31, -39, 14, 45, 6, -43, -24, \
       32, 38, -15, -45, -4, 43, 23, -33, -38, 16, 45, 3, -43, -22, 34, 37, -17, -45, -2, 44, 21, -34, -36, 18, 44, 1, -44, -20, 35, 36, -19, -44, \
       32, 8, -43, -22, 35, 34, -23, -42, 9, 45, 7, -43, -21, 36, 34, -24, -42, 10, 45, 6, -43, -20, 36, 33, -25, -41, 11, 45, 4, -44, -19, 37, \
       32, -26, -41, 12, 45, 3, -44, -18, 38, 31, -27, -40, 13, 45, 2, -44, -17, 38, 30, -28, -40, 14, 45, 1, -44, -16, 39, 30, -29, -39, 15, 45, \
       32, 6, -44, -16, 40, 26, -34, -34, 25, 40, -15, -44, 4, 45, 7, -44, -17, 39, 27, -33, -35, 24, 41, -14, -44, 3, 45, 8, -43, -18, 39, 28, \
       -32, -36, 23, 41, -13, -45, 2, 45, 9, -43, -19, 38, 29, -31, -36, 22, 42, -12, -45, 1, 45, 10, -43, -20, 38, 30, -30, -37, 21, 42, -11, -45, \
       32, 3, -45, -10, 43, 16, -41, -22, 38, 28, -34, -33, 29, 37, -23, -40, 17, 43, -11, -45, 4, 45, 2, -45, -9, 44, 15, -41, -21, 38, 27, -34, \
       -32, 30, 36, -24, -40, 18, 43, -12, -44, 6, 45, 1, -45, -8, 44, 14, -42, -20, 39, 26, -35, -31, 30, 36, -25, -39, 19, 42, -13, -44, 7, 45, \
       32, 1, -45, -3, 45, 6, -45, -8, 44, 10, -44, -12, 43, 14, -43, -16, 42, 18, -41, -20, 40, 22, -39, -24, 38, 26, -36, -28, 35, 30, -34, -31, \
       32, 33, -30, -34, 29, 36, -27, -37, 25, 38, -23, -39, 21, 40, -19, -41, 17, 42, -15, -43, 13, 44, -11, -44, 9, 45, -7, -45, 4, 45, -2, -45, \
       32, -1, -45, 3, 45, -6, -45, 8, 44, -10, -44, 12, 43, -14, -43, 16, 42, -18, -41, 20, 40, -22, -39, 24, 38, -26, -36, 28, 35, -30, -34, 31, \
       32, -33, -30, 34, 29, -36, -27, 37, 25, -38, -23, 39, 21, -40, -19, 41, 17, -42, -15, 43, 13, -44, -11, 44, 9, -45, -7, 45, 4, -45, -2, 45, \
       32, -3, -45, 10, 43, -16, -41, 22, 38, -28, -34, 33, 29, -37, -23, 40, 17, -43, -11, 45, 4, -45, 2, 45, -9, -44, 15, 41, -21, -38, 27, 34, \
       -32, -30, 36, 24, -40, -18, 43, 12, -44, -6, 45, -1, -45, 8, 44, -14, -42, 20, 39, -26, -35, 31, 30, -36, -25, 39, 19, -42, -13, 44, 7, -45, \
       32, -6, -44, 16, 40, -26, -34, 34, 25, -40, -15, 44, 4, -45, 7, 44, -17, -39, 27, 33, -35, -24, 41, 14, -44, -3, 45, -8, -43, 18, 39, -28, \
       -32, 36, 23, -41, -13, 45, 2, -45, 9, 43, -19, -38, 29, 31, -36, -22, 42, 12, -45, -1, 45, -10, -43, 20, 38, -30, -30, 37, 21, -42, -11, 45, \
       32, -8, -43, 22, 35, -34, -23, 42, 9, -45, 7, 43, -21, -36, 34, 24, -42, -10, 45, -6, -43, 20, 36, -33, -25, 41, 11, -45, 4, 44, -19, -37, \
       32, 26, -41, -12, 45, -3, -44, 18, 38, -31, -27, 40, 13, -45, 2, 44, -17, -38, 30, 28, -40, -14, 45, -1, -44, 16, 39, -30, -29, 39, 15, -45, \
       32, -10, -41, 28, 29, -40, -11, 45, -9, -41, 27, 30, -40, -12, 45, -8, -42, 26, 30, -39, -13, 45, -7, -42, 25, 31, -39, -14, 45, -6, -43, 24, \
       32, -38, -15, 45, -4, -43, 23, 33, -38, -16, 45, -3, -43, 22, 34, -37, -17, 45, -2, -44, 21, 34, -36, -18, 44, -1, -44, 20, 35, -36, -19, 44, \
       32, -12, -39, 33, 21, -44, 2, 43, -25, -30, 41, 8, -45, 16, 36, -36, -17, 45, -7, -41, 29, 26, -43, -3, 44, -20, -34, 38, 13, -45, 11, 39, \
       -32, -22, 44, -1, -43, 24, 30, -40, -9, 45, -15, -37, 35, 18, -45, 6, 42, -28, -27, 42, 4, -45, 19, 34, -38, -14, 45, -10, -40, 31, 23, -44, \
       32, -14, -36, 37, 13, -45, 15, 36, -38, -12, 45, -16, -35, 38, 11, -45, 17, 34, -39, -10, 45, -18, -34, 39, 9, -45, 19, 33, -40, -8, 45, -20, \
       -32, 40, 7, -45, 21, 31, -41, -6, 44, -22, -30, 41, 4, -44, 23, 30, -42, -3, 44, -24, -29, 42, 2, -44, 25, 28, -43, -1, 43, -26, -27, 43, \
       32, -16, -34, 40, 4, -44, 27, 24, -44, 8, 39, -36, -13, 45, -19, -31, 42, 1, -43, 30, 21, -45, 11, 37, -38, -10, 45, -22, -29, 43, -2, -41, \
       32, 18, -45, 14, 35, -39, -7, 44, -25, -26, 44, -6, -40, 34, 15, -45, 17, 33, -41, -3, 43, -28, -23, 45, -9, -38, 36, 12, -45, 20, 30, -42, \
       32, -18, -30, 43, -4, -39, 36, 10, -44, 26, 23, -45, 13, 34, -41, -1, 42, -33, -15, 45, -21, -28, 44, -8, -38, 38, 7, -44, 29, 20, -45, 16, \
       32, -42, 2, 40, -35, -12, 45, -24, -25, 45, -11, -36, 40, 3, -43, 31, 17, -45, 19, 30, -43, 6, 39, -37, -9, 44, -27, -22, 45, -14, -34, 41, \
       32, -20, -27, 45, -13, -33, 43, -6, -38, 39, 2, -41, 35, 10, -44, 30, 17, -45, 23, 24, -45, 16, 30, -44, 9, 36, -41, 1, 40, -37, -7, 43, \
       -32, -14, 45, -26, -21, 45, -19, -28, 44, -12, -34, 42, -4, -38, 39, 3, -42, 34, 11, -44, 29, 18, -45, 22, 25, -45, 15, 31, -43, 8, 36, -40, \
       32, -22, -23, 45, -21, -24, 45, -20, -25, 45, -19, -26, 45, -18, -27, 45, -17, -28, 45, -16, -29, 45, -15, -30, 44, -14, -30, 44, -13, -31, 44, -12, \
       -32, 44, -11, -33, 43, -10, -34, 43, -9, -34, 43, -8, -35, 42, -7, -36, 42, -6, -36, 41, -4, -37, 41, -3, -38, 40, -2, -38, 40, -1, -39, 39, \
       32, -24, -19, 45, -29, -14, 44, -33, -9, 42, -36, -3, 40, -39, 2, 37, -42, 8, 34, -44, 13, 30, -45, 18, 25, -45, 23, 20, -45, 28, 15, -44, \
       32, 10, -43, 36, 4, -40, 39, -1, -38, 41, -7, -34, 43, -12, -30, 45, -17, -26, 45, -22, -21, 45, -27, -16, 44, -31, -11, 43, -35, -6, 41, -38, \
       32, -26, -15, 44, -35, -3, 39, -41, 9, 31, -45, 20, 21, -45, 30, 10, -42, 38, -2, -36, 43, -14, -27, 45, -25, -16, 44, -34, -4, 39, -41, 8, \
       32, -45, 19, 22, -45, 30, 11, -42, 38, -1, -36, 43, -13, -28, 45, -24, -17, 44, -34, -6, 40, -40, 7, 33, -44, 18, 23, -45, 29, 12, -43, 37, \
       32, -28, -11, 41, -40, 8, 30, -45, 25, 14, -43, 38, -4, -33, 45, -22, -17, 44, -36, 1, 35, -44, 19, 20, -44, 34, 2, -37, 43, -16, -23, 45, \
       -32, -6, 39, -42, 13, 26, -45, 30, 9, -40, 41, -10, -29, 45, -27, -12, 42, -39, 7, 31, -45, 24, 15, -43, 38, -3, -34, 45, -21, -18, 44, -36, \
       32, -30, -7, 38, -43, 18, 19, -44, 38, -6, -30, 45, -29, -8, 39, -43, 17, 20, -44, 37, -4, -31, 45, -28, -9, 39, -43, 16, 21, -44, 36, -3, \
       -32, 45, -27, -10, 40, -42, 15, 22, -44, 36, -2, -33, 45, -26, -11, 40, -42, 14, 23, -45, 35, -1, -34, 45, -25, -12, 41, -41, 13, 24, -45, 34, \
       32, -31, -2, 34, -45, 28, 7, -37, 44, -24, -11, 39, -43, 20, 15, -41, 42, -16, -19, 43, -40, 12, 23, -44, 38, -8, -27, 45, -35, 3, 30, -45, \
       32, 1, -34, 45, -29, -6, 36, -45, 25, 10, -39, 44, -21, -14, 41, -42, 17, 18, -43, 40, -13, -22, 44, -38, 9, 26, -45, 36, -4, -30, 45, -33, \
       32, -33, 2, 30, -45, 36, -7, -26, 44, -38, 11, 22, -43, 40, -15, -18, 42, -42, 19, 14, -40, 44, -23, -10, 38, -45, 27, 6, -35, 45, -30, -1, \
       32, -45, 34, -3, -29, 45, -36, 8, 25, -44, 39, -12, -21, 43, -41, 16, 17, -41, 43, -20, -13, 39, -44, 24, 9, -37, 45, -28, -4, 34, -45, 31, \
       32, -34, 7, 24, -43, 41, -19, -12, 38, -45, 30, -1, -29, 45, -39, 14, 17, -40, 44, -26, -4, 33, -45, 36, -9, -22, 43, -42, 21, 10, -36, 45, \
       -32, 3, 27, -44, 40, -16, -15, 39, -44, 28, 2, -31, 45, -37, 11, 20, -42, 43, -23, -8, 35, -45, 34, -6, -25, 44, -41, 18, 13, -38, 45, -30, \
       32, -36, 11, 18, -40, 45, -30, 3, 25, -43, 43, -24, -4, 31, -45, 39, -17, -12, 36, -45, 35, -10, -19, 40, -44, 30, -2, -26, 43, -42, 23, 6, \
       -32, 45, -39, 16, 13, -37, 45, -34, 9, 20, -41, 44, -29, 1, 27, -44, 42, -22, -7, 33, -45, 38, -15, -14, 38, -45, 34, -8, -21, 41, -44, 28, \
       32, -37, 15, 12, -35, 45, -39, 18, 9, -33, 45, -40, 21, 6, -30, 44, -42, 24, 2, -28, 43, -43, 27, -1, -25, 42, -44, 30, -4, -22, 41, -45, \
       32, -8, -19, 39, -45, 34, -11, -16, 38, -45, 36, -14, -13, 36, -45, 38, -17, -10, 34, -45, 40, -20, -7, 31, -44, 41, -23, -3, 29, -44, 43, -26, \
       32, -38, 19, 6, -29, 43, -44, 31, -9, -16, 36, -45, 40, -22, -2, 26, -42, 45, -34, 12, 13, -34, 45, -41, 25, -1, -23, 40, -45, 36, -15, -10, \
       32, -44, 43, -28, 4, 20, -39, 45, -38, 18, 7, -30, 43, -44, 30, -8, -17, 37, -45, 39, -21, -3, 27, -42, 44, -33, 11, 14, -35, 45, -41, 24, \
       32, -39, 23, -1, -21, 38, -45, 40, -25, 3, 19, -37, 45, -41, 27, -6, -17, 36, -45, 42, -29, 8, 15, -34, 44, -43, 30, -10, -13, 33, -44, 44, \
       -32, 12, 11, -31, 43, -44, 34, -14, -9, 30, -43, 45, -35, 16, 7, -28, 42, -45, 36, -18, -4, 26, -41, 45, -38, 20, 2, -24, 40, -45, 39, -22, \
       32, -40, 27, -8, -13, 31, -43, 45, -38, 22, -2, -18, 35, -44, 44, -34, 17, 3, -23, 38, -45, 42, -30, 12, 9, -28, 41, -45, 40, -26, 7, 14, \
       -32, 43, -45, 37, -21, 1, 19, -36, 44, -44, 34, -16, -4, 24, -39, 45, -42, 30, -11, -10, 29, -41, 45, -39, 25, -6, -15, 33, -43, 45, -36, 20, \
       32, -41, 30, -14, -4, 22, -36, 44, -44, 37, -23, 6, 13, -30, 41, -45, 42, -31, 15, 3, -21, 36, -44, 45, -38, 24, -7, -12, 29, -40, 45, -42, \
       32, -16, -2, 20, -35, 44, -45, 38, -25, 8, 11, -28, 40, -45, 43, -33, 17, 1, -19, 34, -43, 45, -39, 26, -9, -10, 27, -39, 45, -43, 34, -18, \
       32, -42, 34, -20, 4, 12, -27, 38, -44, 45, -39, 28, -13, -3, 19, -33, 42, -45, 43, -34, 21, -6, -11, 26, -38, 44, -45, 39, -29, 14, 2, -18, \
       32, -41, 45, -43, 35, -22, 7, 10, -25, 37, -44, 45, -40, 30, -15, -1, 17, -31, 41, -45, 43, -36, 23, -8, -9, 24, -36, 44, -45, 40, -30, 16, \
       32, -43, 36, -26, 13, 1, -15, 28, -38, 44, -45, 42, -35, 24, -11, -3, 17, -30, 39, -44, 45, -41, 34, -22, 9, 6, -19, 31, -40, 45, -45, 40, \
       -32, 20, -7, -8, 21, -33, 41, -45, 44, -39, 30, -18, 4, 10, -23, 34, -42, 45, -44, 38, -29, 16, -2, -12, 25, -36, 43, -45, 43, -37, 27, -14, \
       32, -44, 39, -31, 21, -10, -2, 14, -25, 34, -41, 45, -45, 42, -36, 28, -17, 6, 7, -18, 29, -37, 43, -45, 44, -40, 34, -24, 13, -1, -11, 22, \
       -32, 39, -44, 45, -43, 38, -30, 20, -9, -3, 15, -26, 35, -41, 45, -45, 42, -36, 27, -16, 4, 8, -19, 30, -38, 43, -45, 44, -40, 33, -23, 12, \
       32, -44, 41, -36, 29, -20, 11, -1, -9, 18, -27, 34, -40, 44, -45, 45, -42, 37, -30, 22, -13, 3, 7, -16, 25, -33, 39, -43, 45, -45, 43, -38, \
       32, -24, 15, -6, -4, 14, -23, 31, -38, 42, -45, 45, -43, 39, -34, 26, -17, 8, 2, -12, 21, -30, 36, -41, 44, -45, 44, -40, 35, -28, 19, -10, \
       32, -45, 43, -39, 35, -30, 23, -16, 9, -1, -7, 14, -21, 28, -34, 38, -42, 44, -45, 45, -43, 40, -36, 31, -25, 18, -11, 3, 4, -12, 19, -26, \
       32, -37, 41, -44, 45, -45, 44, -41, 38, -33, 27, -20, 13, -6, -2, 10, -17, 24, -30, 36, -40, 43, -45, 45, -44, 42, -39, 34, -29, 22, -15, 8, \
       32, -45, 44, -42, 40, -37, 34, -30, 25, -20, 15, -10, 4, 1, -7, 12, -17, 22, -27, 31, -35, 38, -41, 43, -44, 45, -45, 45, -43, 41, -39, 36, \
       -32, 28, -23, 18, -13, 8, -2, -3, 9, -14, 19, -24, 29, -33, 36, -39, 42, -44, 45, -45, 45, -44, 43, -40, 38, -34, 30, -26, 21, -16, 11, -6, \
       32, -45, 45, -44, 43, -42, 41, -39, 38, -36, 34, -31, 29, -26, 23, -20, 17, -14, 11, -8, 4, -1, -2, 6, -9, 12, -15, 18, -21, 24, -27, 30, \
       -32, 34, -36, 38, -40, 41, -43, 44, -44, 45, -45, 45, -45, 45, -44, 43, -42, 40, -39, 37, -35, 33, -30, 28, -25, 22, -19, 16, -13, 10, -7, 3, \
       32, -45, 45, -45, 45, -45, 45, -45, 44, -44, 44, -44, 43, -43, 43, -42, 42, -41, 41, -40, 40, -39, 39, -38, 38, -37, 36, -36, 35, -34, 34, -33, \
       32, -31, 30, -30, 29, -28, 27, -26, 25, -24, 23, -22, 21, -20, 19, -18, 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1

//*************************************************************************************************
//void tx_dct2_pb64_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift);
//x0: src, 16 bit input blk; rows of 64 elements (128-byte row stride)
//x1: dst, 16 bit output blk; written transposed: the result for input row r and
//    coefficient column c is stored at dst[c * line + r]
//x2: line (blk width), used as the dst stride in elements
//x3: limit_line, number of input rows to process; rows are handled in groups of 4
//    and the first group is always processed (the count is tested after the pass)
//x4: shift; 4 and 11 select a rounding, saturating right shift by that amount,
//    any other value falls through to a shift by 6
//
//line/limit_line/shift are C ints: only the low 32 bits of x2/x3/x4 are defined on
//entry (AAPCS64), so they are read through w2/w3/w4 only.
//clobbers: x0-x3, x5-x9, x11, x12, v0-v7, v16-v31, flags
//*************************************************************************************************
function tx_dct2_pb64_arm64
    sxtw x2, w2             //upper 32 bits of x2 are unspecified for an int argument
    lsl x2, x2, #1          //dst stride in bytes = line * sizeof(s16)
    adr x7, tx_dct2_pb64_coef
    mov x12, #128           //row stride in bytes (64 * s16), used for src and the coef table

//outer loop: one pass per group of 4 src rows
tx_dct2_pb64_loopk:
    mov x8, #0              //byte offset into the coef table (column part, 16 bytes per 8 columns)
    mov x11, x1             //dst write pointer for this group of rows

//middle loop: one pass per group of 8 output columns
tx_dct2_pb64_loopj:
    mov x5, #0              //byte offset inside a src row
    //accumulators: v24/v25 = row 0 (columns 0-3 / 4-7), v26/v27 = row 1,
    //              v28/v29 = row 2, v30/v31 = row 3
    movi v24.16b, #0
    movi v25.16b, #0
    movi v26.16b, #0
    movi v27.16b, #0
    movi v28.16b, #0
    movi v29.16b, #0
    movi v30.16b, #0
    movi v31.16b, #0

//inner loop: 8 src elements of each of the 4 rows per pass
tx_dct2_pb64_loopi:
    //load src: 8 elements from each of 4 consecutive rows
    add x6, x0, x5
    ld1 {v0.8h}, [x6], x12
    ld1 {v1.8h}, [x6], x12
    ld1 {v2.8h}, [x6], x12
    ld1 {v3.8h}, [x6], x12

    //load 8 consecutive coef table rows, 8 columns each
    add x9, x7, x8
    ld1 {v16.8h}, [x9], x12
    ld1 {v17.8h}, [x9], x12
    ld1 {v18.8h}, [x9], x12
    ld1 {v19.8h}, [x9], x12
    ld1 {v20.8h}, [x9], x12
    ld1 {v21.8h}, [x9], x12
    ld1 {v22.8h}, [x9], x12
    ld1 {v23.8h}, [x9], x12

    //row 0: acc[c] += coef[i][c] * src[i]
    smlal v24.4s, v16.4h, v0.h[0]
    smlal2 v25.4s, v16.8h, v0.h[0]
    smlal v24.4s, v17.4h, v0.h[1]
    smlal2 v25.4s, v17.8h, v0.h[1]
    smlal v24.4s, v18.4h, v0.h[2]
    smlal2 v25.4s, v18.8h, v0.h[2]
    smlal v24.4s, v19.4h, v0.h[3]
    smlal2 v25.4s, v19.8h, v0.h[3]
    smlal v24.4s, v20.4h, v0.h[4]
    smlal2 v25.4s, v20.8h, v0.h[4]
    smlal v24.4s, v21.4h, v0.h[5]
    smlal2 v25.4s, v21.8h, v0.h[5]
    smlal v24.4s, v22.4h, v0.h[6]
    smlal2 v25.4s, v22.8h, v0.h[6]
    smlal v24.4s, v23.4h, v0.h[7]
    smlal2 v25.4s, v23.8h, v0.h[7]

    //row 1
    smlal v26.4s, v16.4h, v1.h[0]
    smlal2 v27.4s, v16.8h, v1.h[0]
    smlal v26.4s, v17.4h, v1.h[1]
    smlal2 v27.4s, v17.8h, v1.h[1]
    smlal v26.4s, v18.4h, v1.h[2]
    smlal2 v27.4s, v18.8h, v1.h[2]
    smlal v26.4s, v19.4h, v1.h[3]
    smlal2 v27.4s, v19.8h, v1.h[3]
    smlal v26.4s, v20.4h, v1.h[4]
    smlal2 v27.4s, v20.8h, v1.h[4]
    smlal v26.4s, v21.4h, v1.h[5]
    smlal2 v27.4s, v21.8h, v1.h[5]
    smlal v26.4s, v22.4h, v1.h[6]
    smlal2 v27.4s, v22.8h, v1.h[6]
    smlal v26.4s, v23.4h, v1.h[7]
    smlal2 v27.4s, v23.8h, v1.h[7]

    //row 2
    smlal v28.4s, v16.4h, v2.h[0]
    smlal2 v29.4s, v16.8h, v2.h[0]
    smlal v28.4s, v17.4h, v2.h[1]
    smlal2 v29.4s, v17.8h, v2.h[1]
    smlal v28.4s, v18.4h, v2.h[2]
    smlal2 v29.4s, v18.8h, v2.h[2]
    smlal v28.4s, v19.4h, v2.h[3]
    smlal2 v29.4s, v19.8h, v2.h[3]
    smlal v28.4s, v20.4h, v2.h[4]
    smlal2 v29.4s, v20.8h, v2.h[4]
    smlal v28.4s, v21.4h, v2.h[5]
    smlal2 v29.4s, v21.8h, v2.h[5]
    smlal v28.4s, v22.4h, v2.h[6]
    smlal2 v29.4s, v22.8h, v2.h[6]
    smlal v28.4s, v23.4h, v2.h[7]
    smlal2 v29.4s, v23.8h, v2.h[7]

    //row 3
    smlal v30.4s, v16.4h, v3.h[0]
    smlal2 v31.4s, v16.8h, v3.h[0]
    smlal v30.4s, v17.4h, v3.h[1]
    smlal2 v31.4s, v17.8h, v3.h[1]
    smlal v30.4s, v18.4h, v3.h[2]
    smlal2 v31.4s, v18.8h, v3.h[2]
    smlal v30.4s, v19.4h, v3.h[3]
    smlal2 v31.4s, v19.8h, v3.h[3]
    smlal v30.4s, v20.4h, v3.h[4]
    smlal2 v31.4s, v20.8h, v3.h[4]
    smlal v30.4s, v21.4h, v3.h[5]
    smlal2 v31.4s, v21.8h, v3.h[5]
    smlal v30.4s, v22.4h, v3.h[6]
    smlal2 v31.4s, v22.8h, v3.h[6]
    smlal v30.4s, v23.4h, v3.h[7]
    smlal2 v31.4s, v23.8h, v3.h[7]

    add x5, x5, #16         //next 8 src elements
    add x8, x8, #1024       //next 8 coef table rows (8 * 128 bytes)
    cmp x5, #128            //whole 64-element src row consumed?
    bne tx_dct2_pb64_loopi

    //round, shift and narrow to 16 bit: v0-v3 = columns 0-3 of rows 0-3,
    //v4-v7 = columns 4-7 of rows 0-3
    cmp w4, #4
    beq tx_dct2_pb64_shift4
    cmp w4, #11
    beq tx_dct2_pb64_shift11

    //any other shift value: shift by 6
    sqrshrn v0.4h, v24.4s, #6
    sqrshrn v1.4h, v26.4s, #6
    sqrshrn v2.4h, v28.4s, #6
    sqrshrn v3.4h, v30.4s, #6
    sqrshrn v4.4h, v25.4s, #6
    sqrshrn v5.4h, v27.4s, #6
    sqrshrn v6.4h, v29.4s, #6
    sqrshrn v7.4h, v31.4s, #6
    b tx_dct2_pb64_end

tx_dct2_pb64_shift4:
    sqrshrn v0.4h, v24.4s, #4
    sqrshrn v1.4h, v26.4s, #4
    sqrshrn v2.4h, v28.4s, #4
    sqrshrn v3.4h, v30.4s, #4
    sqrshrn v4.4h, v25.4s, #4
    sqrshrn v5.4h, v27.4s, #4
    sqrshrn v6.4h, v29.4s, #4
    sqrshrn v7.4h, v31.4s, #4
    b tx_dct2_pb64_end

tx_dct2_pb64_shift11:
    sqrshrn v0.4h, v24.4s, #11
    sqrshrn v1.4h, v26.4s, #11
    sqrshrn v2.4h, v28.4s, #11
    sqrshrn v3.4h, v30.4s, #11
    sqrshrn v4.4h, v25.4s, #11
    sqrshrn v5.4h, v27.4s, #11
    sqrshrn v6.4h, v29.4s, #11
    sqrshrn v7.4h, v31.4s, #11

tx_dct2_pb64_end:
    //transposed store: each st4 writes one output column for the 4 src rows
    //(4 consecutive s16), then steps dst by one line
    st4 {v0.h - v3.h}[0], [x11], x2
    st4 {v0.h - v3.h}[1], [x11], x2
    st4 {v0.h - v3.h}[2], [x11], x2
    st4 {v0.h - v3.h}[3], [x11], x2
    st4 {v4.h - v7.h}[0], [x11], x2
    st4 {v4.h - v7.h}[1], [x11], x2
    st4 {v4.h - v7.h}[2], [x11], x2
    st4 {v4.h - v7.h}[3], [x11], x2

    //loopi advanced x8 by 8 * 1024 = 8192; subtract 8176 so the net step is 16 bytes
    //(8 coef columns). 8176 does not fit a 12-bit immediate, hence two subtractions.
    sub x8, x8, #4095
    sub x8, x8, #4081       //8176=4095 + 4081
    cmp x8, #128            //all 64 coef columns done?
    bne tx_dct2_pb64_loopj

    add x0, x0, #512        //next 4 src rows (4 * 128 bytes)
    add x1, x1, #8          //next 4 elements inside each dst line
    subs w3, w3, #4         //limit_line is an int: count down in w3, not x3
    bgt tx_dct2_pb64_loopk

    ret
#endif
